{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 准备工作"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from lxml.html import fromstring\n",
    "import time\n",
    "from random import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [],
   "source": [
    "# coding=utf-8\n",
    "from selenium import webdriver\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:19: DeprecationWarning: use options instead of chrome_options\n"
     ]
    }
   ],
   "source": [
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n",
    "\n",
    "\n",
    "#caps=dict()\n",
    "#caps[\"pageLoadStrategy\"] = \"none\"   # Do not wait for full page load\n",
    "\n",
    "opts = webdriver.ChromeOptions()\n",
    "opts.add_argument('--no-sandbox')#解决DevToolsActivePort文件不存在的报错\n",
    "opts.add_argument('window-size=1920x3000') #指定浏览器分辨率\n",
    "opts.add_argument('--disable-gpu') #谷歌文档提到需要加上一这个属性来规避bug\n",
    "opts.add_argument('--hide-scrollbars') #隐藏滚动条, 应对些特殊页面\n",
    "#opts.add_argument('blink-settings=imagesEnabled=false') #不加载图片, 提升速度\n",
    "#opts.add_argument('--headless') #浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败\n",
    "# opts.binary_location = \"C:\\portable\\PortableApps\\IronPortable\\App\\Iron\\chrome.exe\"\n",
    "# opts.binary_location = \"C:\\Program Files\\Google\\Chrome\\Application\\chromedriver.exe\" #\"H:\\_coding_\\Gitee\\InternetNewMedia\\CapstonePrj2016\\chromedriver.exe\"  \n",
    "\n",
    "\n",
    "driver = webdriver.Chrome( chrome_options = opts) #desired_capabilities=caps,"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 进入知网高级检索页面\n",
    "driver.get(\"https://kns.cnki.net/kns8/AdvSearch?dbprefix=SCDB\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [],
   "source": [
    "#登陆中大南方账号\n",
    "driver.find_element_by_xpath('//*[@id=\"Ecp_top_login\"]/a').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.find_element_by_xpath('//*[@id=\"Ecp_Button2\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'中山大学'"
      ]
     },
     "execution_count": 120,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 检查登陆状况\n",
    "driver.find_element_by_id('Ecp_loginShowName1').get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点击学术期刊\n",
    "driver.find_element_by_xpath('/html/body/div[3]/div[1]/div/ul[1]/li[1]/a/span').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [],
   "source": [
    "#点击专业检索\n",
    "driver.find_element_by_name('majorSearch').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 设置query\n",
    "query =\"SU %= '可持续发展'  AND SU %= '数字' OR SU %= '绿色营销'\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 输入检索内容\n",
    "element = driver.find_element_by_xpath('/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/textarea')\n",
    "element.clear()\n",
    "element.send_keys(query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 选择SCI,EI,北大核心,CSSCI,CSCD期刊\n",
    "driver.find_element_by_xpath('/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/div[1]/div[3]/div/label[2]/input').click()\n",
    "driver.find_element_by_xpath('/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/div[1]/div[3]/div/label[3]/input').click()\n",
    "driver.find_element_by_xpath('/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/div[1]/div[3]/div/label[4]/input').click()\n",
    "driver.find_element_by_xpath('/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/div[1]/div[3]/div/label[5]/input').click()\n",
    "driver.find_element_by_xpath('/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/div[1]/div[3]/div/label[6]/input').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点击检索\n",
    "driver.find_element_by_xpath('/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/div[2]/input').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [],
   "source": [
    "#按照被引数量文章排序\n",
    "driver.find_element_by_xpath('//*[@id=\"orderList\"]/li[3]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 更换每页文章数量\n",
    "element = driver.find_element_by_id('perPageDiv')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [],
   "source": [
    "element = driver.find_element_by_xpath('//li[@data-val=\"50\"]/a')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 提取前20页"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [],
   "source": [
    "pages = list(range(1,21))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [],
   "source": [
    "表格_html = dict()\n",
    "main_content =\"\"\n",
    "element = None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 翻页"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_pages (pages):\n",
    "    for p in pages:\n",
    "        print (p,end='\\t')\n",
    "        # 定位到“下一页”的按钮 ——> 点击\n",
    "        跳转 = driver.find_element_by_id('PageNext')\n",
    "        跳转.click()\n",
    "        # 设定休息的时间 ——> 避免爬虫被禁报错、以及出现验证码\n",
    "        time.sleep(30+20*random())\n",
    "        # 获取含有页面主要数据的表格\n",
    "        element = driver.find_element_by_id('gridTable')\n",
    "        main_content = element.get_attribute('innerHTML')\n",
    "        表格_html[p] = main_content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11\t12\t13\t14\t15\t16\t17\t18\t19\t20\t"
     ]
    }
   ],
   "source": [
    "process_pages(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        html_snippets\n",
       "1   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "2   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "3   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "4   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "5   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "6   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "7   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "8   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "9   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "10  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "11  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "12  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "13  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "14  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "15  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "16  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "17  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "18  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "19  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "20  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ..."
      ]
     },
     "execution_count": 135,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame([表格_html]).T\n",
    "df.columns = [\"html_snippets\"]\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {},
   "outputs": [],
   "source": [
    "l_df = []\n",
    "for p in pages:\n",
    "    表格 = pd.read_html(表格_html[p])[0]\n",
    "    l_df.append(表格)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>51</td>\n",
       "      <td>企业绿色营销系统的构建与绩效评价</td>\n",
       "      <td>司林胜</td>\n",
       "      <td>系统工程</td>\n",
       "      <td>2003-08-25</td>\n",
       "      <td>53</td>\n",
       "      <td>1087</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>52</td>\n",
       "      <td>试析技术性贸易壁垒及我国的应对策略</td>\n",
       "      <td>孙敬水</td>\n",
       "      <td>商业经济与管理</td>\n",
       "      <td>2002-06-25</td>\n",
       "      <td>52</td>\n",
       "      <td>508</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>53</td>\n",
       "      <td>绿色管理:21世纪企业管理研究的新领域</td>\n",
       "      <td>刘承伟</td>\n",
       "      <td>齐鲁学刊</td>\n",
       "      <td>2001-07-30</td>\n",
       "      <td>52</td>\n",
       "      <td>524</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>54</td>\n",
       "      <td>中国大陆地区古籍数字化问题及对策</td>\n",
       "      <td>高娟; 刘家真</td>\n",
       "      <td>中国图书馆学报</td>\n",
       "      <td>2013-07-15</td>\n",
       "      <td>51</td>\n",
       "      <td>2192</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>55</td>\n",
       "      <td>企业低碳绿色战略研究</td>\n",
       "      <td>吴维库; 李贞恩</td>\n",
       "      <td>经济纵横</td>\n",
       "      <td>2010-07-25</td>\n",
       "      <td>51</td>\n",
       "      <td>1207</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>995</th>\n",
       "      <td>1046</td>\n",
       "      <td>图书馆开源社区持续发展模式研究</td>\n",
       "      <td>陈大庆; 胡燕菘; 叶兰</td>\n",
       "      <td>图书馆学研究</td>\n",
       "      <td>2011-08-08</td>\n",
       "      <td>4</td>\n",
       "      <td>253</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>996</th>\n",
       "      <td>1047</td>\n",
       "      <td>科技期刊数字化独家经营之理性审视</td>\n",
       "      <td>梁华凝</td>\n",
       "      <td>科技与出版</td>\n",
       "      <td>2011-08-08</td>\n",
       "      <td>4</td>\n",
       "      <td>106</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>997</th>\n",
       "      <td>1048</td>\n",
       "      <td>湖北省大别山区特色农产品绿色营销策略分析</td>\n",
       "      <td>李爱玲</td>\n",
       "      <td>农业经济</td>\n",
       "      <td>2011-12-15</td>\n",
       "      <td>4</td>\n",
       "      <td>450</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>998</th>\n",
       "      <td>1049</td>\n",
       "      <td>基于当前我国中小企业营销的创新策略研究</td>\n",
       "      <td>蔡璐; 田芯; 王姝; 王红双</td>\n",
       "      <td>中国商贸</td>\n",
       "      <td>2011-12-01</td>\n",
       "      <td>4</td>\n",
       "      <td>158</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>999</th>\n",
       "      <td>1050</td>\n",
       "      <td>从比较优势到竞争优势——我国加工贸易发展战略研究</td>\n",
       "      <td>钟子建</td>\n",
       "      <td>生产力研究</td>\n",
       "      <td>2010-03-15</td>\n",
       "      <td>4</td>\n",
       "      <td>662</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1000 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Unnamed: 0                        篇名               作者       刊名  \\\n",
       "0            51          企业绿色营销系统的构建与绩效评价              司林胜     系统工程   \n",
       "1            52         试析技术性贸易壁垒及我国的应对策略              孙敬水  商业经济与管理   \n",
       "2            53       绿色管理:21世纪企业管理研究的新领域              刘承伟     齐鲁学刊   \n",
       "3            54          中国大陆地区古籍数字化问题及对策          高娟; 刘家真  中国图书馆学报   \n",
       "4            55                企业低碳绿色战略研究         吴维库; 李贞恩     经济纵横   \n",
       "..          ...                       ...              ...      ...   \n",
       "995        1046           图书馆开源社区持续发展模式研究     陈大庆; 胡燕菘; 叶兰   图书馆学研究   \n",
       "996        1047          科技期刊数字化独家经营之理性审视              梁华凝    科技与出版   \n",
       "997        1048      湖北省大别山区特色农产品绿色营销策略分析              李爱玲     农业经济   \n",
       "998        1049       基于当前我国中小企业营销的创新策略研究  蔡璐; 田芯; 王姝; 王红双     中国商贸   \n",
       "999        1050  从比较优势到竞争优势——我国加工贸易发展战略研究              钟子建    生产力研究   \n",
       "\n",
       "           发表时间  被引    下载  操作  \n",
       "0    2003-08-25  53  1087  下载  \n",
       "1    2002-06-25  52   508  下载  \n",
       "2    2001-07-30  52   524  下载  \n",
       "3    2013-07-15  51  2192  下载  \n",
       "4    2010-07-25  51  1207  下载  \n",
       "..          ...  ..   ...  ..  \n",
       "995  2011-08-08   4   253  下载  \n",
       "996  2011-08-08   4   106  下载  \n",
       "997  2011-12-15   4   450  下载  \n",
       "998  2011-12-01   4   158  下载  \n",
       "999  2010-03-15   4   662  下载  \n",
       "\n",
       "[1000 rows x 8 columns]"
      ]
     },
     "execution_count": 137,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_url_out = pd.concat(l_df).reset_index(drop=True)\n",
    "df_url_out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>51</td>\n",
       "      <td>企业绿色营销系统的构建与绩效评价</td>\n",
       "      <td>司林胜</td>\n",
       "      <td>系统工程</td>\n",
       "      <td>2003-08-25</td>\n",
       "      <td>53</td>\n",
       "      <td>1087</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>52</td>\n",
       "      <td>试析技术性贸易壁垒及我国的应对策略</td>\n",
       "      <td>孙敬水</td>\n",
       "      <td>商业经济与管理</td>\n",
       "      <td>2002-06-25</td>\n",
       "      <td>52</td>\n",
       "      <td>508</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>53</td>\n",
       "      <td>绿色管理:21世纪企业管理研究的新领域</td>\n",
       "      <td>刘承伟</td>\n",
       "      <td>齐鲁学刊</td>\n",
       "      <td>2001-07-30</td>\n",
       "      <td>52</td>\n",
       "      <td>524</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>54</td>\n",
       "      <td>中国大陆地区古籍数字化问题及对策</td>\n",
       "      <td>高娟; 刘家真</td>\n",
       "      <td>中国图书馆学报</td>\n",
       "      <td>2013-07-15</td>\n",
       "      <td>51</td>\n",
       "      <td>2192</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>55</td>\n",
       "      <td>企业低碳绿色战略研究</td>\n",
       "      <td>吴维库; 李贞恩</td>\n",
       "      <td>经济纵横</td>\n",
       "      <td>2010-07-25</td>\n",
       "      <td>51</td>\n",
       "      <td>1207</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>995</th>\n",
       "      <td>1046</td>\n",
       "      <td>图书馆开源社区持续发展模式研究</td>\n",
       "      <td>陈大庆; 胡燕菘; 叶兰</td>\n",
       "      <td>图书馆学研究</td>\n",
       "      <td>2011-08-08</td>\n",
       "      <td>4</td>\n",
       "      <td>253</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>996</th>\n",
       "      <td>1047</td>\n",
       "      <td>科技期刊数字化独家经营之理性审视</td>\n",
       "      <td>梁华凝</td>\n",
       "      <td>科技与出版</td>\n",
       "      <td>2011-08-08</td>\n",
       "      <td>4</td>\n",
       "      <td>106</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>997</th>\n",
       "      <td>1048</td>\n",
       "      <td>湖北省大别山区特色农产品绿色营销策略分析</td>\n",
       "      <td>李爱玲</td>\n",
       "      <td>农业经济</td>\n",
       "      <td>2011-12-15</td>\n",
       "      <td>4</td>\n",
       "      <td>450</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>998</th>\n",
       "      <td>1049</td>\n",
       "      <td>基于当前我国中小企业营销的创新策略研究</td>\n",
       "      <td>蔡璐; 田芯; 王姝; 王红双</td>\n",
       "      <td>中国商贸</td>\n",
       "      <td>2011-12-01</td>\n",
       "      <td>4</td>\n",
       "      <td>158</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>999</th>\n",
       "      <td>1050</td>\n",
       "      <td>从比较优势到竞争优势——我国加工贸易发展战略研究</td>\n",
       "      <td>钟子建</td>\n",
       "      <td>生产力研究</td>\n",
       "      <td>2010-03-15</td>\n",
       "      <td>4</td>\n",
       "      <td>662</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1000 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Unnamed: 0                        篇名               作者       刊名  \\\n",
       "0            51          企业绿色营销系统的构建与绩效评价              司林胜     系统工程   \n",
       "1            52         试析技术性贸易壁垒及我国的应对策略              孙敬水  商业经济与管理   \n",
       "2            53       绿色管理:21世纪企业管理研究的新领域              刘承伟     齐鲁学刊   \n",
       "3            54          中国大陆地区古籍数字化问题及对策          高娟; 刘家真  中国图书馆学报   \n",
       "4            55                企业低碳绿色战略研究         吴维库; 李贞恩     经济纵横   \n",
       "..          ...                       ...              ...      ...   \n",
       "995        1046           图书馆开源社区持续发展模式研究     陈大庆; 胡燕菘; 叶兰   图书馆学研究   \n",
       "996        1047          科技期刊数字化独家经营之理性审视              梁华凝    科技与出版   \n",
       "997        1048      湖北省大别山区特色农产品绿色营销策略分析              李爱玲     农业经济   \n",
       "998        1049       基于当前我国中小企业营销的创新策略研究  蔡璐; 田芯; 王姝; 王红双     中国商贸   \n",
       "999        1050  从比较优势到竞争优势——我国加工贸易发展战略研究              钟子建    生产力研究   \n",
       "\n",
       "           发表时间  被引    下载  操作  \n",
       "0    2003-08-25  53  1087  下载  \n",
       "1    2002-06-25  52   508  下载  \n",
       "2    2001-07-30  52   524  下载  \n",
       "3    2013-07-15  51  2192  下载  \n",
       "4    2010-07-25  51  1207  下载  \n",
       "..          ...  ..   ...  ..  \n",
       "995  2011-08-08   4   253  下载  \n",
       "996  2011-08-08   4   106  下载  \n",
       "997  2011-12-15   4   450  下载  \n",
       "998  2011-12-01   4   158  下载  \n",
       "999  2010-03-15   4   662  下载  \n",
       "\n",
       "[1000 rows x 8 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "with pd.ExcelWriter('知网文章数据.xlsx',mode='w',engine=\"openpyxl\") as writer:  \n",
    "            df_url_out.to_excel(writer,sheet_name=\"知网\")\n",
    "display(df_url_out)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 导出refworks文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]\n"
     ]
    }
   ],
   "source": [
    "#由于数量限制在一次导出最多500篇，因此分开两批进行\n",
    "#第一批数据导出\n",
    "pages = list(range(1,11))\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {},
   "outputs": [],
   "source": [
    "#回到第一页\n",
    "driver.find_element_by_id('total').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.find_element_by_xpath('//*[@id=\"gridTable\"]/div[1]/div[2]/div[1]/a').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 151,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_choose(pages):\n",
    "    for p in pages:\n",
    "        print (p,end='\\t')\n",
    "        全选 = driver.find_element_by_id('selectCheckAll1')\n",
    "        全选.click()\n",
    "        time.sleep(30+20*random())\n",
    "        跳转 = driver.find_element_by_id('PageNext')\n",
    "        跳转.click()\n",
    "        time.sleep(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t"
     ]
    }
   ],
   "source": [
    "process_choose(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "metadata": {},
   "outputs": [],
   "source": [
    "#选择refworks导出\n",
    "driver.find_element_by_xpath('//*[@id=\"batchOpsBox\"]/li[2]/i').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.find_element_by_xpath('//*[@id=\"batchOpsBox\"]/li[2]/ul/li[1]/i').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.find_element_by_xpath('//*[@id=\"batchOpsBox\"]/li[2]/ul/li[1]/ul/li[8]/a').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CDwindow-5FAA3431997C505154A20D9AC73991CB',\n",
       " 'CDwindow-E433EC2C0179DB92B4DA470EDD3F719D']"
      ]
     },
     "execution_count": 156,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 切换窗口\n",
    "driver.window_handles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 158,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: DeprecationWarning: use driver.switch_to.window instead\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n"
     ]
    }
   ],
   "source": [
    "driver.switch_to_window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {},
   "outputs": [],
   "source": [
    "#下载TXT文件\n",
    "driver.find_element_by_xpath('//i[@class=\"icon icon-export\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 160,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  \n"
     ]
    }
   ],
   "source": [
    "# 下载原文\n",
    "driver.switch_to_window(driver.window_handles[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 161,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.find_element_by_xpath('//li[@class=\"bulkdownload export\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 190,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: DeprecationWarning: use driver.switch_to.window instead\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n"
     ]
    }
   ],
   "source": [
    "driver.switch_to_window(driver.window_handles[2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 191,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.find_element_by_xpath('//*[@id=\"btn-download-all\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 168,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: DeprecationWarning: use driver.switch_to.window instead\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n"
     ]
    }
   ],
   "source": [
    "driver.switch_to_window(driver.window_handles[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 169,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the <a> link at div[1]/div[2]/div[1] under the element with id 'gridTable'.\n",
    "# NOTE(review): what this link does is not evident from the XPath -- confirm\n",
    "# against the page before relying on it.\n",
    "driver.find_element_by_xpath('//*[@id=\"gridTable\"]/div[1]/div[2]/div[1]/a').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 170,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[11, 12, 13, 14, 15, 16, 17, 18, 19, 20]\n"
     ]
    }
   ],
   "source": [
    "# Next batch of page numbers to process: 11 through 20 inclusive.\n",
    "pages = [page_no for page_no in range(11, 21)]\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 171,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "11\t12\t13\t14\t15\t16\t17\t18\t19\t20\t"
     ]
    }
   ],
   "source": [
    "# Run the process_choose helper (defined outside this section) on the page batch.\n",
    "# The recorded output shows each page number printed, separated by tabs.\n",
    "process_choose(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 172,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Choose the RefWorks export.\n",
    "# First click the <i> element whose class attribute is exactly 'icon-d'.\n",
    "driver.find_element_by_xpath('//i[@class=\"icon-d\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 173,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the <i> element whose class attribute is exactly 'icon-r'.\n",
    "driver.find_element_by_xpath('//i[@class=\"icon-r\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 174,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the <a> element whose exporttype attribute is 'Refworks'.\n",
    "driver.find_element_by_xpath('//a[@exporttype=\"Refworks\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 175,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CDwindow-5FAA3431997C505154A20D9AC73991CB',\n",
       " 'CDwindow-E433EC2C0179DB92B4DA470EDD3F719D',\n",
       " 'CDwindow-13D693FF5C84CFCFA13F243C6D61DAEA',\n",
       " 'CDwindow-DBBD870ECD72179E927F75A58F21DDDD']"
      ]
     },
     "execution_count": 175,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the handles of all currently open windows; the next cell switches by index.\n",
    "driver.window_handles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 178,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Focus the window at index 3 of driver.window_handles, which was the last of the\n",
    "# four handles in the listing recorded by the preceding cell.\n",
    "# NOTE(review): the hard-coded index breaks if a different number of windows is open.\n",
    "# switch_to.window replaces the deprecated switch_to_window.\n",
    "driver.switch_to.window(driver.window_handles[3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 179,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the TXT file.\n",
    "# Clicks the <i> element whose class attribute is exactly 'icon icon-export'.\n",
    "driver.find_element_by_xpath('//i[@class=\"icon icon-export\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 184,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Focus the window at index 0 of driver.window_handles\n",
    "# (presumably the search-results window -- TODO confirm).\n",
    "# switch_to.window replaces the deprecated switch_to_window.\n",
    "driver.switch_to.window(driver.window_handles[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 186,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the <li> element whose class attribute is exactly 'bulkdownload export'.\n",
    "# NOTE(review): presumably this opens the bulk-download page in a new window -- confirm.\n",
    "driver.find_element_by_xpath('//li[@class=\"bulkdownload export\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 187,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CDwindow-5FAA3431997C505154A20D9AC73991CB',\n",
       " 'CDwindow-E433EC2C0179DB92B4DA470EDD3F719D',\n",
       " 'CDwindow-13D693FF5C84CFCFA13F243C6D61DAEA',\n",
       " 'CDwindow-DBBD870ECD72179E927F75A58F21DDDD',\n",
       " 'CDwindow-99C6A575D48CB7A1A4AB2B3843A67FFE',\n",
       " 'CDwindow-99C26FC6FAFAC5E0C4283685F960EABB']"
      ]
     },
     "execution_count": 187,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the handles of all currently open windows; the next cell switches by index.\n",
    "driver.window_handles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 188,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Focus the window at index 5 of driver.window_handles, which was the last of the\n",
    "# six handles in the listing recorded by the preceding cell.\n",
    "# NOTE(review): the hard-coded index breaks if a different number of windows is open.\n",
    "# switch_to.window replaces the deprecated switch_to_window.\n",
    "driver.switch_to.window(driver.window_handles[5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 189,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the element with id 'btn-download-all' in the currently focused window.\n",
    "driver.find_element_by_id('btn-download-all').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
